In [1]:
from bertopic import BERTopic

# Load the saved BERTopic model from its on-disk location.
MODEL_PATH = "/home/zhhuang/climate_policy_paper/code/model_save/bert_topic_model"
topic_model = BERTopic.load(MODEL_PATH)
/home/zhhuang/anaconda3/envs/climatepolicy/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [2]:
import pandas as pd

# Read the document table and pull out the two columns used below:
# the document texts ("docs") and their year labels ("Year").
DOCS_XLSX = "/home/zhhuang/climate_policy_paper/code/data/Topic_docs_time.xlsx"
df = pd.read_excel(DOCS_XLSX)
docs = df["docs"].to_list()
timestamp = df["Year"].to_list()
In [3]:
# Show, for every document, its assigned topic together with the topic's
# name, top words, assignment probability and representative-document flag.
# (A preceding bare `get_topic_info()['Topic']` expression was removed: it was
# not the last expression of the cell, so its value was computed and discarded.)
DOC_INFO_COLUMNS = ["Topic", "Name", "Top_n_words", "Probability", "Representative_document"]
topic_model.get_document_info(docs)[DOC_INFO_COLUMNS]
Out[3]:
Topic Name Top_n_words Probability Representative_document
0 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.098617 False
1 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.186230 False
2 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.397535 False
3 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.186223 False
4 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.154408 False
... ... ... ... ... ...
63023 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.957031 False
63024 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.958408 False
63025 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.470342 False
63026 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.151238 False
63027 -1 -1_energy_development_management_measure energy - development - management - measure - ... 0.247359 False

63028 rows × 5 columns

In [4]:
# Print the most frequent whitespace-separated tokens across all documents.
from collections import Counter

TOP_N_WORDS = 100

# Count every token; `counts` stays a dict-like mapping of word -> frequency.
counts = Counter(word for doc in docs for word in doc.split())

# most_common(n) orders by descending count (ties keep first-seen order) and
# simply returns fewer rows when the vocabulary has fewer than n entries,
# instead of raising IndexError like a fixed range(100) index loop would.
items = counts.most_common(TOP_N_WORDS)

# Size the word column to the longest word so that long tokens no longer run
# into the count (previously e.g. "development22243").
word_width = max((len(word) for word, _ in items), default=0)
for word, count in items:
    print("{0:<{width}} {1:>7}".format(word, count, width=word_width))
energy    32557
land      26785
development22243
plan      16386
forest    16345
national  15246
agricultural14260
management14175
environmental14088
activity  13607
public    13302
establish 13100
purpose   12904
article   12576
protection11571
policy    10876
resource  10656
project   10645
sector    10475
measure   10026
production 9984
system     9774
power      9735
product    9501
set        9461
provide    9039
emission   8984
water      8951
environment 8878
procedure  8671
include    8475
natural    8443
consist    8282
regulation 8186
gas        8008
promote    7947
service    7866
condition  7784
support    7747
requirement 7626
control    7420
renewable  7393
electricity 7231
application 7224
sustainable 7014
organization 6967
implementation 6950
aim        6861
rule       6731
ensure     6729
efficiency 6713
economic   6623
government 6618
regulate   6473
carry      6467
standard   6391
establishes 6336
relate     6250
legal      6119
plant      6115
operation  6014
objective  5929
grant      5924
implement  5871
rural      5809
authority  5751
person     5731
program    5658
process    5527
building   5515
market     5426
action     5421
strategy   5383
create     5326
local      5293
``         5242
function   5187
minister   5171
right      5170
tax        5153
waste      5143
quality    5101
property   5085
conservation 5071
text       5037
fuel       4956
increase   4947
investment 4932
agreement  4904
source     4841
technical  4828
supply     4799
form       4736
fund       4733
divide     4693
framework  4690
registration 4664
issue      4662
protect    4651
reduce     4640
In [5]:
# Ask the model for the five topics matching the query term, then display the
# (word, weight) pairs of the first topic returned.
query = "Transport"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
topic_model.get_topic(similar_topics[0])
Out[5]:
[('grassland', 0.7370147131588162),
 ('permanent', 0.16952735063090357),
 ('supplementary', 0.08571556527024189),
 ('prohibition', 0.06655119250597118),
 ('liability', 0.05962473348425665),
 ('implement', 0.05679452995567585),
 ('ordinance', 0.04459698230176548),
 ('farming', 0.03570748090982378),
 ('afforestation', 0.034822982723819756),
 ('reforestation', 0.033169812063301087)]
In [6]:
# Ask the model for the five topics matching the query term, then display the
# (word, weight) pairs of the first topic returned.
query = "Industry"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
topic_model.get_topic(similar_topics[0])
Out[6]:
[('fishery', 0.13944542494364653),
 ('agriculture', 0.07943544997415505),
 ('aquatic', 0.05477908036914138),
 ('fishing', 0.05304962776556011),
 ('fisheries', 0.04572184777019522),
 ('aquaculture', 0.037815564170875074),
 ('forestry', 0.037434174371047554),
 ('shellfish', 0.023419507524976874),
 ('concern', 0.02104295450374762),
 ('habitat', 0.018744302196177558)]
In [7]:
# Ask the model for the five topics matching the query term, then display the
# (word, weight) pairs of the first topic returned.
query = "Energy systems"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
topic_model.get_topic(similar_topics[0])
Out[7]:
[('biodiversity', 0.18095997991522123),
 ('biological', 0.04911069515872883),
 ('sectoral', 0.024109475728398406),
 ('conserve', 0.023615540593107116),
 ('strategic', 0.022708791554987603),
 ('specie', 0.020704883791587),
 ('objective', 0.019437590501399926),
 ('management', 0.018636830995872957),
 ('genetic', 0.01731194493947348),
 ('equitable', 0.017097970345597457)]
In [8]:
# Ask the model for the five topics matching the query term, then display the
# (word, weight) pairs of the first topic returned.
query = "Buildings"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
topic_model.get_topic(similar_topics[0])
Out[8]:
[('energy', 0.010344741577772185),
 ('management', 0.008305760013746135),
 ('development', 0.008278847252110371),
 ('mineral', 0.007564087707767394),
 ('resource', 0.00745341265083421),
 ('protection', 0.007203174117237951),
 ('property', 0.006589749077854606),
 ('vehicle', 0.0062958130351013514),
 ('timber', 0.006279371535782798),
 ('exploration', 0.006189109348312551)]
In [9]:
# Number of documents taken from the "docs" column of the spreadsheet.
len(docs)
Out[9]:
63028
In [10]:
import os
# Directory that receives the exported figures. exist_ok=True creates it
# (including parents) when missing and is a no-op when it already exists,
# avoiding the check-then-create race of a separate os.path.exists() test.
images_path = "/home/zhhuang/climate_policy_paper/paper_images"
os.makedirs(images_path, exist_ok=True)
In [11]:
import plotly.io as pio
# Make SVG the default format of the kaleido static-image export scope.
pio.kaleido.scope.default_format = "svg"
# Disabled: explicit MathJax source for kaleido (closing quote restored in this comment).
# pio.kaleido.scope.mathjax = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js"
In [12]:
# Bar charts of the top 10 words for 20 topics; save as SVG and show inline.
barchart_svg = '/home/zhhuang/climate_policy_paper/paper_images/topic_barchart.svg'
fig = topic_model.visualize_barchart(
    top_n_topics=20,
    n_words=10,
    width=300,
    height=300,
)
fig.write_image(barchart_svg)
fig
In [13]:
# topic_model.visualize_barchart(top_n_topics = 20, n_words=10,width = 300, height= 300)
In [14]:
# Topic heatmap from the model; save as SVG and show inline.
heatmap_svg = '/home/zhhuang/climate_policy_paper/paper_images/topic_heatmap.svg'
fig2 = topic_model.visualize_heatmap()
fig2.write_image(heatmap_svg)
fig2
In [15]:
# Topic overview plot from the model; save as SVG and show inline.
topics_svg = '/home/zhhuang/climate_policy_paper/paper_images/topic_visualize_topics.svg'
fig3 = topic_model.visualize_topics()
fig3.write_image(topics_svg)
fig3
In [16]:
# Build the topic hierarchy, persist it to Excel, reload it, and plot it.
#
# The workbook used to be written to a path relative to the current working
# directory but read back from an absolute path, so the reload only saw the
# fresh result when the notebook happened to run from the code directory.
# A single path is now used for both the write and the read.
HIERARCHY_XLSX = "/home/zhhuang/climate_policy_paper/code/Topic_hierarchical_topics.xlsx"

# Slow step: the recorded run shows 58 iterations taking about 21 minutes.
hierarchical_topics = topic_model.hierarchical_topics(docs)

# strings_to_urls=False stops xlsxwriter from converting URL-like strings
# into hyperlinks when writing cells.
with pd.ExcelWriter(HIERARCHY_XLSX, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    hierarchical_topics.to_excel(writer)

# The plot is drawn from the reloaded workbook, exactly as before.
hierarchical_topics = pd.read_excel(HIERARCHY_XLSX)

fig4 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
pio.write_image(fig4, '/home/zhhuang/climate_policy_paper/paper_images/topic_hierarchical_topics.svg')
fig4
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [20:48<00:00, 21.53s/it]
In [17]:
# Compute topic frequencies over time and persist them to Excel.
#
# Year labels are normalised to strings for datetime_format="%Y". A year of 0
# is replaced by '2020'. The comparison is done on str(year) so the
# replacement also applies when the Year column was read as numbers; the
# previous `i == '0'` test could only ever match string values.
timestamp = ['2020' if str(year) == '0' else str(year) for year in timestamp]

# Slow step: the recorded run took roughly 2.5 hours.
topics_over_time = topic_model.topics_over_time(docs, timestamp, datetime_format="%Y", nr_bins=20)

# Written to the same absolute location the following cell reads from, rather
# than a path relative to whatever the current working directory is.
TOPICS_OVER_TIME_XLSX = "/home/zhhuang/climate_policy_paper/code/Topic_topics_over_time.xlsx"
with pd.ExcelWriter(TOPICS_OVER_TIME_XLSX, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    topics_over_time.to_excel(writer)
19it [2:36:39, 494.73s/it] 
In [18]:
# Reload the saved topics-over-time table, plot it, save as SVG, show inline.
over_time_xlsx = "/home/zhhuang/climate_policy_paper/code/Topic_topics_over_time.xlsx"
over_time_svg = '/home/zhhuang/climate_policy_paper/paper_images/topic_visualize_topics_over_time.svg'

topics_over_time = pd.read_excel(over_time_xlsx)
fig5 = topic_model.visualize_topics_over_time(topics_over_time)
fig5.write_image(over_time_svg)
fig5
In [ ]: